Ran the jupyter notebook in Google Colab
Here, we import all the libraries required to perform operations on the data
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
import warnings
warnings.filterwarnings("ignore")
from sklearn import preprocessing
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.model_selection import GridSearchCV
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.display import Image
import pydotplus
import graphviz
We read the given CSV file into a dataframe
# Load the dataset into a dataframe.
# NOTE(review): the UCI bank-full.csv is commonly semicolon-delimited; if the
# shape comes out as one single column, pass sep=';' — confirm against the file.
data=pd.read_csv('/content/bank-full.csv')
print('The shape of the data is',data.shape)
data.columns
print('the data type of each attribute is : ')
print(" "*50)
data.info()
# Per-column count of missing values
data.isnull().sum()
# Rows containing at least one null; axis passed by keyword — the bare
# positional form `any(1)` is deprecated in modern pandas.
null=data[data.isnull().any(axis=1)]
print(null)
# Transposed five-number summary of the numeric columns
data.describe().T
data.head(10)
# Pairwise scatter plots of all numeric attributes (slow on ~45k rows)
sns.pairplot(data)
Observation: As we can see, there is no linear relationship between any two attributes in the pairplot above
This column is about the age of customer
# --- 'age' column: five-number summary and distribution ---
print("The 5 point summary of age column would give the below results: ")
data['age'].describe().T
plt.figure(figsize=(15,10))
sns.set_color_codes()
# Histogram + KDE of age (sns.distplot is deprecated in newer seaborn;
# sns.histplot / sns.displot are the modern equivalents)
ax = sns.distplot(data['age'], color="r")
# Same distribution again with a rug plot and a step-style histogram overlay
ax = sns.distplot(data['age'], rug=True, rug_kws={"color": "g"},
kde_kws={"color": "k", "lw": 3, "label": "KDE"},
hist_kws={"histtype": "step", "linewidth": 3,
"alpha": 1, "color": "g"})
Observation : From the above two plots (distplot and KDE plot), we can say that the age is approximately normally distributed
# Frequency of each individual age value
plt.figure(figsize=(20,5))
sns.countplot(data['age'])
Looking for outliers in age column
# Univariate box plot of age
plt.figure(figsize=(15,10))
sns.boxplot(data['age'])
# Age split by the target (subscribed yes/no)
plt.figure(figsize=(15,5))
sns.boxplot(y=data['age'],x=data['Target'])
plt.figure(figsize=(15,5))
sns.violinplot(y=data['age'],x=data['Target'])
Observation : Even though there are outliers in age column, they cannot be ignored because many of the aged clients may subscribe to the term deposit
Lets do statistical analysis whether age is related to the client subscribed a term deposit or not.
Question: Is age related to the client subscribing a term deposit or not?
print("Individual count of age column: "," "*500, data['age'].value_counts())
NULL HYPOTHESIS(H0): Age is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): Age is related to the client subscribing a term deposit
# Level of significance = 0.05
null = 'Age is not related to the client subscribing a term deposit'
alternate = 'Age is related to the client subscribing a term deposit'
a=np.array(data[data['Target']=='yes'].age)
b=np.array(data[data['Target']=='no'].age)
t_stat, p_value = stats.ttest_ind(a,b,axis=0)
print("the caluclated value of tstatistic is",t_stat)
print("The pvalue is",p_value)
print(""*1000)
print("Based on the statistical evidence")
if p_value > 0.05:
print("we fail to reject null hypothesis as the p_value", p_value, "is greater than 0.05")
print(null)
else:
print("we reject null hypothesis as the p_value", p_value, "is less than 0.05")
print(alternate)
This column is about the type of job of the client or customer
# --- 'job' column: category counts overall and split by target ---
print("Different jobs been done by customers:", '\n',data['job'].unique())
print("Individual count of job column: ")
data['job'].value_counts()
plt.figure(figsize=(15,10))
plt.title("count plot of job column")
sns.countplot(data['job'])
# Job distribution among subscribers only
print("Individual count of job column who subscribed to term deposit: ")
data[data['Target']=='yes'].job.value_counts()
plt.figure(figsize=(18,7))
plt.title("Individual count plot of job column who subscribed to term deposit")
sns.countplot(data[data['Target']=='yes'].job)
Observation: Among those who subscribed to term deposit, management related job holders are the highest followed by technician
# Job distribution among non-subscribers
print("Individual count of job column who did not subscribed to term deposit: ")
data[data['Target']=='no'].job.value_counts()
plt.figure(figsize=(18,7))
plt.title("Individual count plot of job column who have not subscribed to term deposit")
sns.countplot(data[data['Target']=='no'].job)
Observation: Among those who did not subscribed to term deposit, blue collar related job holders are the highest followed by management and technician
Lets do statistical analysis whether type of job is related to the client subscribed a term deposit or not.
Question: Is type of job related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0): Type of Job is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): Type of job is related to the client subscribing a term deposit
# Level of significance = 0.05
# (redundant mid-file numpy/pandas re-imports removed: both are already
# imported at the top of the file)
# Contingency table: job category vs. subscription outcome
s = pd.crosstab(data.job, data.Target, margins = False)
print(s)
# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2
# contingency table
table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print("Note : dof = (rows-1)*(cols-1) = (12-1)*(2-1)")
# Expected cell frequencies under the independence hypothesis
print(expected)
# interpret test-statistic (indentation of the if-bodies restored)
null = 'Type of job is not related to the client subscribing a term deposit'
alternate = 'Type of job is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
    print('Dependent (reject H0)')
    print(alternate)
else:
    print('Independent (fail to reject H0)')
    print(null)
# interpret p-value (equivalent decision via alpha = 1 - prob)
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
    print(alternate)
else:
    print('Independent (fail to reject H0)')
    print(null)
This column states the marital status of the client
# --- 'marital' column: category counts overall and split by target ---
print("Marital status in the dataset:", '\n',data['marital'].unique())
print("Individual count of marital column: ", data['marital'].value_counts())
plt.figure(figsize=(15,10))
plt.title("count plot of marital column")
sns.countplot(data['marital'])
# Marital status among subscribers only
print("Individual count of marital column who subscribed to term deposit: "," "*10, data[data['Target']=='yes'].marital.value_counts())
plt.figure(figsize=(18,7))
plt.title("count of marital column who subscribed to term deposit")
sns.countplot(data[data['Target']=='yes'].marital)
# Marital status among non-subscribers
print("Individual count of marital column who did not subscribed to term deposit: ")
data[data['Target']=='no'].marital.value_counts()
plt.figure(figsize=(18,7))
plt.title("count of marital column who did not subscribed to term deposit")
sns.countplot(data[data['Target']=='no'].marital)
Observation:
Lets do statistical analysis whether type of marital status is related to the client subscribed a term deposit or not.
Question: Is marital status related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0): Marital status is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): Marital status is related to the client subscribing a term deposit
# Level of significance = 0.05
import numpy as np
import pandas as pd
# Contingency table: marital status vs. subscription outcome
s = pd.crosstab(data.marital, data.Target, margins = False)
print(s)
# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2
# contingency table
table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print("Note : dof = (rows-1)*(cols-1) = (3-1)*(2-1)")
# Expected cell frequencies under the independence hypothesis
print(expected)
# interpret test-statistic
# NOTE(review): the if-bodies below lost their indentation when the notebook
# was flattened to text; re-indent before running this as a plain script.
null = 'Marital status is not related to the client subscribing a term deposit'
alternate = 'Marital status is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
This column states the education level of the client
# --- 'education' column: category counts overall and split by target ---
print("Education did by customers:", '\n',data['education'].unique())
print("Individual count of education column: "," "*5, data['education'].value_counts())
plt.figure(figsize=(15,10))
plt.title("count plot of education column")
sns.countplot(data['education'])
# Education level among subscribers only
print("Individual count of Education column who subscribed to term deposit: "," "*5, data[data['Target']=='yes'].education.value_counts())
plt.figure(figsize=(18,7))
plt.title("count plot of Education column who subscribed to term deposit")
sns.countplot(data[data['Target']=='yes'].education)
# Education level among non-subscribers
print("Individual count of education column who did not subscribed to term deposit: "," "*4, data[data['Target']=='no'].education.value_counts())
plt.figure(figsize=(18,7))
plt.title("count plot of Education column who did not subscribed to term deposit")
sns.countplot(data[data['Target']=='no'].education)
Observation:
Lets do statistical analysis whether type of education is related to the client subscribed a term deposit or not.
Question: Is level of education related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0): level of education is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): level of education is related to the client subscribing a term deposit
# Level of significance = 0.05
import numpy as np
import pandas as pd

# Cross-tabulate education level against the subscription outcome.
s = pd.crosstab(data.education, data.Target, margins = False)
print(s)

# Chi-squared test of independence on the contingency table.
from scipy.stats import chi2_contingency
from scipy.stats import chi2

table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print(f'dof={dof}')
print("Note : dof = (rows-1)*(cols-1) = (4-1)*(2-1)")
print(expected)

# Decision rule 1: compare the test statistic against the critical value.
null = 'level of education is not related to the client subscribing a term deposit'
alternate = 'level of education is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print(f'probability={prob:.3f}, critical={critical:.3f}, stat={stat:.3f}')
dependent = abs(stat) >= critical
print('Dependent (reject H0)' if dependent else 'Independent (fail to reject H0)')
print(alternate if dependent else null)

# Decision rule 2: compare the p-value against the significance level.
alpha = 1.0 - prob
print(f'significance={alpha:.3f}, p={p:.3f}')
significant = p <= alpha
print('Dependent (reject H0)' if significant else 'Independent (fail to reject H0)')
print(alternate if significant else null)
This column tells whether the customer has default in credit or not
# --- 'default' column (has credit in default?): counts overall and by target ---
print("Unique values of default column:", '\n',data['default'].unique())
print("Individual count of default column: ")
data['default'].value_counts()
plt.figure(figsize=(15,10))
plt.title("count plot of default column")
sns.countplot(data['default'])
# Default status among subscribers only
print("Individual count of default column who subscribed to term deposit: ")
data[data['Target']=='yes'].default.value_counts()
plt.figure(figsize=(18,7))
plt.title("count plot of default column who subscribed to term deposit")
sns.countplot(data[data['Target']=='yes'].default)
# Default status among non-subscribers
print("Individual count of default column who did not subscribed to term deposit: ")
data[data['Target']=='no'].default.value_counts()
plt.figure(figsize=(18,7))
plt.title("count plot of default column who did not subscribed to term deposit")
sns.countplot(data[data['Target']=='no'].default)
Observations:
Lets do statistical analysis whether type of existence of credit in default is related to the client subscribed a term deposit or not.
Question: Is existence of credit in default related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0): existence of credit in default is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): existence of credit in default is related to the client subscribing a term deposit
# Level of significance = 0.05
import numpy as np
import pandas as pd
# Contingency table: credit-in-default vs. subscription outcome
s = pd.crosstab(data.default, data.Target, margins = False)
print(s)
# chi-squared test with similar proportions
from scipy.stats import chi2_contingency
from scipy.stats import chi2
# contingency table
table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print("Note : dof = (rows-1)*(cols-1) = (2-1)*(2-1)")
# Expected cell frequencies under the independence hypothesis
print(expected)
# interpret test-statistic
# NOTE(review): the if-bodies below lost their indentation when the notebook
# was flattened to text; re-indent before running this as a plain script.
null = 'existence of credit in default is not related to the client subscribing a term deposit'
alternate = 'existence of credit in default is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
This column is about the average yearly balance in euros of customer
# --- 'balance' column (average yearly balance, EUR): summary and distribution ---
print("The 5 point summary of balance column would give the below results: ")
data['balance'].describe().T
plt.figure(figsize=(15,10))
sns.set_color_codes()
# Histogram + KDE (sns.distplot is deprecated in newer seaborn)
ax = sns.distplot(data['balance'], color="r")
plt.figure(figsize=(15,10))
# Same distribution again with rug marks and a step-histogram overlay
ax = sns.distplot(data['balance'], rug=True, rug_kws={"color": "g"},
kde_kws={"color": "k", "lw": 3, "label": "KDE"},
hist_kws={"histtype": "step", "linewidth": 3,
"alpha": 1, "color": "g"})
Looking for outliers in balance column
# Univariate box plot of balance, then balance split by the target
plt.figure(figsize=(15,10))
sns.boxplot(data['balance'])
plt.figure(figsize=(15,5))
sns.boxplot(y=data['balance'],x=data['Target'])
plt.figure(figsize=(15,5))
sns.violinplot(y=data['balance'],x=data['Target'])
Observations:
Lets do statistical analysis whether age is related to the client subscribed a term deposit or not.
Question: Is age related to the client subscriBING a term deposit or not. ?
print("Individual count of balance column: "," "*500, data['balance'].value_counts())
NULL HYPOTHESIS(H0): Avg balance is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): Avg balance is related to the client subscribing a term deposit
# Level of significance = 0.05
null = 'Avg balance is not related to the client subscribing a term deposit'
alternate = 'Avg balance is related to the client subscribing a term deposit'
a=np.array(data[data['Target']=='yes'].balance)
b=np.array(data[data['Target']=='no'].balance)
t_stat, p_value = stats.ttest_ind(a,b,axis=0)
print("the caluclated value of tstatistic is",t_stat)
print("The pvalue is",p_value)
print(""*1000)
print("Based on the statistical evidence")
if p_value > 0.05:
print("we fail to reject null hypothesis as the p_value", p_value, "is greater than 0.05")
print(null)
else:
print("we reject null hypothesis as the p_value", p_value, "is less than 0.05")
print(alternate)
This column is about whether the customer has housing loan or not
# --- 'housing' column (has housing loan?): counts overall and by target ---
print("Unique values of Housing column:", '\n',data['housing'].unique())
print("Individual count of housing column: ")
data['housing'].value_counts()
plt.figure(figsize=(15,10))
plt.title("count plot of housing column")
sns.countplot(data['housing'])
# Housing-loan status among subscribers only
print("Individual count of housing column who subscribed to term deposit: ")
data[data['Target']=='yes'].housing.value_counts()
plt.figure(figsize=(18,7))
plt.title("count plot of housing column who subscribed to term deposit")
sns.countplot(data[data['Target']=='yes'].housing)
# Housing-loan status among non-subscribers
print("Individual count of housing column who did not subscribed to term deposit: ")
data[data['Target']=='no'].housing.value_counts()
plt.figure(figsize=(18,7))
plt.title("count plot of housing column who did not subscribed to term deposit")
sns.countplot(data[data['Target']=='no'].housing)
Observation: Most of them who subscribed to a deposit have no housing loan and this is based on the count plot shown above
Lets do statistical analysis whether having housing loan is related to the client subscribed a term deposit or not.
Question: Is having housing loan related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0): having housing loan is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): having housing loan is related to the client subscribing a term deposit
# Level of significance = 0.05
import numpy as np
import pandas as pd
# Contingency table: housing loan vs. subscription outcome
# (chi2_contingency and chi2 are in scope from the earlier scipy imports)
s = pd.crosstab(data.housing, data.Target, margins = False)
print(s)
# contingency table
table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print("Note : dof = (rows-1)*(cols-1) = (2-1)*(2-1)")
# Expected cell frequencies under the independence hypothesis
print(expected)
# interpret test-statistic
# NOTE(review): the if-bodies below lost their indentation when the notebook
# was flattened to text; re-indent before running this as a plain script.
null = 'having housing loan is not related to the client subscribing a term deposit'
alternate = 'having housing loan is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
This column is about whether the customer has personal loan or not
# --- 'loan' column (has personal loan?): counts overall and by target ---
print("Unique values of Loan column:", '\n',data['loan'].unique())
print("Individual count of Loan column: ")
data['loan'].value_counts()
plt.figure(figsize=(15,10))
plt.title("count plot of Loan column")
sns.countplot(data['loan'])
# Personal-loan status among subscribers only
print("Individual count of Loan column who subscribed to term deposit: ")
data[data['Target']=='yes'].loan.value_counts()
plt.figure(figsize=(18,7))
plt.title("count plot of Loan column who subscribed to term deposit")
sns.countplot(data[data['Target']=='yes'].loan)
# Personal-loan status among non-subscribers
print("Individual count of loan column who did not subscribed to term deposit: ")
data[data['Target']=='no'].loan.value_counts()
plt.figure(figsize=(18,7))
plt.title("count plot of Loan column who did not subscribed to term deposit")
sns.countplot(data[data['Target']=='no'].loan)
Observation:
Lets do statistical analysis whether having personal loan is related to the client subscribed a term deposit or not.
Question: Is having personal loan related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0): having personal loan is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): having personal loan is related to the client subscribing a term deposit
# Level of significance = 0.05
import numpy as np
import pandas as pd
# Contingency table: personal loan vs. subscription outcome
# (chi2_contingency and chi2 are in scope from the earlier scipy imports)
s = pd.crosstab(data.loan, data.Target, margins = False)
print(s)
# contingency table
table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print("Note : dof = (rows-1)*(cols-1) = (2-1)*(2-1)")
# Expected cell frequencies under the independence hypothesis
print(expected)
# interpret test-statistic
# NOTE(review): the if-bodies below lost their indentation when the notebook
# was flattened to text; re-indent before running this as a plain script.
null = 'having personal loan is not related to the client subscribing a term deposit'
alternate = 'having personal loan is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
This column is about the communication type
# --- 'contact' column (communication type): counts overall and by target ---
print("Unique values of Contact column:", '\n',data['contact'].unique())
print("Individual count of Contact column: ")
data['contact'].value_counts()
plt.figure(figsize=(15,10))
plt.title("count plot of contact column")
sns.countplot(data['contact'])
# Contact mode among subscribers only
print("Individual count of contact column who subscribed to term deposit: ")
data[data['Target']=='yes'].contact.value_counts()
plt.figure(figsize=(18,7))
plt.title("count plot of contact column who subscribed to term deposit")
sns.countplot(data[data['Target']=='yes'].contact)
# Contact mode among non-subscribers
print("Individual count of contact column who did not subscribed to term deposit: ")
data[data['Target']=='no'].contact.value_counts()
plt.figure(figsize=(18,7))
plt.title("count plot of contact column who did not subscribed to term deposit")
sns.countplot(data[data['Target']=='no'].contact)
Observation: Cellular mode of contact is the most happened mode of communication or contact
Lets do statistical analysis whether mode of communication with the customer is related to the client subscribed a term deposit or not.
Question: Is mode of communication with the customer related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0): mode of communication with the customer is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): mode of communication with the customer is related to the client subscribing a term deposit
# Level of significance = 0.05
# (redundant mid-file numpy/pandas re-imports removed: both are already
# imported at the top of the file)
# Contingency table: contact mode vs. subscription outcome
s = pd.crosstab(data.contact, data.Target, margins = False)
print(s)
# contingency table
table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
# The 'contact' column of bank-full has 3 levels (cellular / telephone /
# unknown), so the note was corrected from (2-1)*(2-1); the printed dof
# above confirms the actual value.
print("Note : dof = (rows-1)*(cols-1) = (3-1)*(2-1)")
# Expected cell frequencies under the independence hypothesis
print(expected)
# interpret test-statistic (indentation of the if-bodies restored)
null = 'mode of communication with the customer is not related to the client subscribing a term deposit'
alternate = 'mode of communication with the customer is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
    print('Dependent (reject H0)')
    print(alternate)
else:
    print('Independent (fail to reject H0)')
    print(null)
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
    print(alternate)
else:
    print('Independent (fail to reject H0)')
    print(null)
This column is about the last contact day of the month of customer
# --- 'day' column (last contact day of month): distribution and outliers ---
plt.figure(figsize=(15,10))
sns.set_color_codes()
# Histogram + KDE (sns.distplot is deprecated in newer seaborn)
ax = sns.distplot(data['day'], color="r")
plt.figure(figsize=(15,10))
ax = sns.distplot(data['day'], rug=True, rug_kws={"color": "g"},
kde_kws={"color": "k", "lw": 3, "label": "KDE"},
hist_kws={"histtype": "step", "linewidth": 3,
"alpha": 1, "color": "g"})
# Frequency of each contact day
plt.figure(figsize=(20,5))
sns.countplot(data['day'])
Looking for outliers in last contact day of the month column
plt.figure(figsize=(15,10))
sns.boxplot(data['day'])
# Day split by subscription outcome
plt.figure(figsize=(15,5))
sns.boxplot(y=data['day'],x=data['Target'])
plt.figure(figsize=(15,5))
sns.violinplot(y=data['day'],x=data['Target'])
Lets do statistical analysis whether day of last contact is related to the client subscribed a term deposit or not.
Question: Is day of last contact related to the client subscriBING a term deposit or not. ?
print("Individual count of age column: "," "*500, data['day'].value_counts())
NULL HYPOTHESIS(H0): day of last contact with the customer is not related to him subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): day of last contact with the customer is related to him subscribing a term deposit
# Level of significance = 0.05
null = 'day of last contact with the customer is not related to him subscribing a term deposit'
alternate = 'day of last contact with the customer is related to him subscribing a term deposit'
a=np.array(data[data['Target']=='yes'].day)
b=np.array(data[data['Target']=='no'].day)
t_stat, p_value = stats.ttest_ind(a,b,axis=0)
print("the caluclated value of tstatistic is",t_stat)
print("The pvalue is",p_value)
print(""*1000)
print("Based on the statistical evidence")
if p_value > 0.05:
print("we fail to reject null hypothesis as the p_value", p_value, "is greater than 0.05")
print(null)
else:
print("we reject null hypothesis as the p_value", p_value, "is less than 0.05")
print(alternate)
This column is about the last contact month with customer
# --- 'month' column (last contact month): counts overall and by target ---
print("Unique values of month column:", '\n',data['month'].unique())
print("Individual count of month column: ")
data['month'].value_counts()
plt.figure(figsize=(15,10))
plt.title("count plot of month column")
sns.countplot(data['month'])
# Contact month among subscribers only
print("Individual count of month column who subscribed to term deposit: ")
data[data['Target']=='yes'].month.value_counts()
plt.figure(figsize=(18,7))
sns.countplot(data[data['Target']=='yes'].month)
# Contact month among non-subscribers
print("Individual count of month column who did not subscribed to term deposit: ")
data[data['Target']=='no'].month.value_counts()
plt.figure(figsize=(18,7))
sns.countplot(data[data['Target']=='no'].month)
Lets do statistical analysis whether last contact month with the customer is related to the client subscribed a term deposit or not.
Question: Is last contact month with the customer related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0): last contact month with the customer is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): last contact month with the customer is related to the client subscribing a term deposit
# Level of significance = 0.05
import numpy as np
import pandas as pd
# Contingency table: contact month vs. subscription outcome
# (chi2_contingency and chi2 are in scope from the earlier scipy imports)
s = pd.crosstab(data.month, data.Target, margins = False)
print(s)
# contingency table
table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
print("Note : dof = (rows-1)*(cols-1) = (12-1)*(2-1)")
# Expected cell frequencies under the independence hypothesis
print(expected)
# interpret test-statistic
# NOTE(review): the if-bodies below lost their indentation when the notebook
# was flattened to text; re-indent before running this as a plain script.
null = 'last contact month with the customer is not related to the client subscribing a term deposit'
alternate = 'last contact month with the customer is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
This column tells the duration in seconds of contact
# --- 'duration' column (last contact duration, seconds): summary and distribution ---
print("The 5 point summary of duration column would give the below results: ")
data['duration'].describe().T
plt.figure(figsize=(15,10))
sns.set_color_codes()
# Histogram + KDE (sns.distplot is deprecated in newer seaborn)
ax = sns.distplot(data['duration'], color="r")
ax = sns.distplot(data['duration'], rug=True, rug_kws={"color": "g"},
kde_kws={"color": "k", "lw": 3, "label": "KDE"},
hist_kws={"histtype": "step", "linewidth": 3,
"alpha": 1, "color": "g"})
Looking for outliers in duration column
plt.figure(figsize=(15,10))
sns.boxplot(data['duration'])
# Duration split by subscription outcome
plt.figure(figsize=(15,5))
sns.boxplot(y=data['duration'],x=data['Target'])
Observation:
plt.figure(figsize=(15,5))
sns.violinplot(y=data['duration'],x=data['Target'])
Lets do statistical analysis whether time spoken to customer is related him subscribing to a term deposit or not.
Question: Is duration related to the client subscriBING a term deposit or not. ?
print("Individual count of duration column: "," "*500, data['duration'].value_counts())
NULL HYPOTHESIS(H0): Duration of call is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): Duration of call is related to the client subscribing a term deposit
# Level of significance = 0.05
null = 'Duration of call is not related to the client subscribing a term deposit'
alternate = 'Duration of call is related to the client subscribing a term deposit'
a=np.array(data[data['Target']=='yes'].duration)
b=np.array(data[data['Target']=='no'].duration)
t_stat, p_value = stats.ttest_ind(a,b,axis=0)
print("the caluclated value of tstatistic is",t_stat)
print("The pvalue is",p_value)
print(""*1000)
print("Based on the statistical evidence")
if p_value > 0.05:
print("we fail to reject null hypothesis as the p_value", p_value, "is greater than 0.05")
print(null)
else:
print("we reject null hypothesis as the p_value", p_value, "is less than 0.05")
print(alternate)
This column is about number of contacts performed during this campaign
# --- 'campaign' column (contacts during this campaign): summary and distribution ---
print("The 5 point summary of campaign column would give the below results: ")
data['campaign'].describe().T
plt.figure(figsize=(15,10))
sns.set_color_codes()
# Histogram + KDE (sns.distplot is deprecated in newer seaborn)
ax = sns.distplot(data['campaign'], color="r")
plt.figure(figsize=(15,10))
ax = sns.distplot(data['campaign'], rug=True, rug_kws={"color": "g"},
kde_kws={"color": "k", "lw": 3, "label": "KDE"},
hist_kws={"histtype": "step", "linewidth": 3,
"alpha": 1, "color": "g"})
Looking for outliers in campaign column
plt.figure(figsize=(15,10))
sns.boxplot(data['campaign'])
# Campaign contacts split by subscription outcome
plt.figure(figsize=(15,5))
sns.boxplot(y=data['campaign'],x=data['Target'])
plt.figure(figsize=(15,5))
sns.violinplot(y=data['campaign'],x=data['Target'])
Observation:
Lets do statistical analysis whether number of times spoken to customer is related him subscribing to a term deposit or not.
Question: Is number of times contacted related to the client subscriBING a term deposit or not. ?
print("Individual count of campaign column: "," "*500, data['campaign'].value_counts())
NULL HYPOTHESIS(H0): Number of times contacted is not related to him subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): Number of times contacted is related to him subscribing a term deposit
# Level of significance = 0.05
null = 'Number of times contacted is not related to him subscribing a term deposit'
alternate = 'Number of times contacted is related to him subscribing a term deposit'
a=np.array(data[data['Target']=='yes'].campaign)
b=np.array(data[data['Target']=='no'].campaign)
t_stat, p_value = stats.ttest_ind(a,b,axis=0)
print("the caluclated value of tstatistic is",t_stat)
print("The pvalue is",p_value)
print(""*1000)
print("Based on the statistical evidence")
if p_value > 0.05:
print("we fail to reject null hypothesis as the p_value", p_value, "is greater than 0.05")
print(null)
else:
print("we reject null hypothesis as the p_value", p_value, "is less than 0.05")
print(alternate)
number of days that passed by after the client was last contacted from a previous campaign
(numeric; 999 means client was not previously contacted)
# Rows where the client was (per the note above) never previously contacted.
# NOTE(review): the classic bank-full.csv encodes "not previously contacted"
# as pdays = -1, while 999 is used in bank-additional — confirm which encoding
# this file actually uses; with -1 this filter would return an empty frame.
data[data['pdays']==999]
# --- 'previous' column (contacts before this campaign): counts and by target ---
print("Unique values of previous column:", '\n',data['previous'].unique())
print("Individual count of previous column: "," "*500, data['previous'].value_counts())
plt.figure(figsize=(15,10))
plt.title("count plot of previous column")
sns.countplot(data['previous'])
# Previous-contact counts among subscribers only
print("Individual count of previous column who subscribed to term deposit: ")
data[data['Target']=='yes'].previous.value_counts()
plt.figure(figsize=(18,7))
sns.countplot(data[data['Target']=='yes'].previous)
# Previous-contact counts among non-subscribers
print("Individual count of previous column who did not subscribed to term deposit: ")
data[data['Target']=='no'].previous.value_counts()
plt.figure(figsize=(18,7))
sns.countplot(data[data['Target']=='no'].previous)
Lets do statistical analysis whether no of previous contacts with the customer is related to the client subscribed a term deposit or not.
Question: Is no of previous contacts with the customer related to the client subscriBING a term deposit or not. ?
NULL HYPOTHESIS(H0):no of previous contacts with the customer is not related to the client subscribing a term deposit
ALTERNATIVE HYPOTHESIS(Ha): no of previous contacts with the customer is related to the client subscribing a term deposit
# Level of significance = 0.05
import numpy as np
import pandas as pd
# Contingency table: number of previous contacts vs. subscription outcome
# (chi2_contingency and chi2 are in scope from the earlier scipy imports)
s = pd.crosstab(data.previous, data.Target, margins = False)
print(s)
# contingency table
table = s
print("Contingency table:")
print(table)
stat, p, dof, expected = chi2_contingency(table)
print('dof=%d' % dof)
# NOTE(review): the "41" below assumes 41 distinct values of `previous`;
# verify against the printed dof above.
print("Note : dof = (rows-1)*(cols-1) = (41-1)*(2-1)")
# Expected cell frequencies under the independence hypothesis
print(expected)
# interpret test-statistic
# NOTE(review): the if-bodies below lost their indentation when the notebook
# was flattened to text; re-indent before running this as a plain script.
null = 'no of previous contacts with the customer is not related to the client subscribing a term deposit'
alternate = 'no of previous contacts with the customer is related to the client subscribing a term deposit'
prob = 0.95
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
print('Dependent (reject H0)')
print(alternate)
else:
print('Independent (fail to reject H0)')
print(null)
This column (poutcome) is about the outcome of the previous marketing campaign.
# EDA for 'poutcome' (outcome of the previous marketing campaign),
# overall and split by the Target outcome.
print("Unique values of previous outcome column:", '\n', data['poutcome'].unique())
print("Individual count of previous outcome column: ")
data['poutcome'].value_counts()
plt.figure(figsize=(15, 10))
plt.title("count plot of previous outcome column")
sns.countplot(data['poutcome'])
# boolean masks for the two Target outcomes
yes_mask = data['Target'] == 'yes'
no_mask = data['Target'] == 'no'
print("Individual count of previous outcome column who subscribed to term deposit: ")
data.loc[yes_mask, 'poutcome'].value_counts()
plt.figure(figsize=(18, 7))
plt.title("count plot of previous outcome column who subscribed to term deposit")
sns.countplot(data.loc[yes_mask, 'poutcome'])
print("Individual count of previous outcome column who did not subscribed to term deposit: ")
data.loc[no_mask, 'poutcome'].value_counts()
plt.figure(figsize=(18, 7))
plt.title("count plot of previous outcome column who did not subscribed to term deposit")
sns.countplot(data.loc[no_mask, 'poutcome'])
Let us do a statistical analysis of whether the outcome of the previous marketing campaign is related to the client subscribing to a term deposit.
Question: Is the outcome of the previous marketing campaign related to the client subscribing to a term deposit?
NULL HYPOTHESIS (H0): the outcome of the previous marketing campaign is not related to the client subscribing to a term deposit.
ALTERNATIVE HYPOTHESIS (Ha): the outcome of the previous marketing campaign is related to the client subscribing to a term deposit.
# Chi-square test of independence between 'poutcome' (previous-campaign
# outcome) and Target (term-deposit subscription).
# Level of significance = 0.05
s = pd.crosstab(data.poutcome, data.Target, margins=False)
print(s)
# contingency table
table = s
print("Contingency table:")
print(table)
# FIX: chi2_contingency lives in scipy.stats (imported at the top of the
# file as `stats`); the bare name was never imported and raised NameError.
stat, p, dof, expected = stats.chi2_contingency(table)
print('dof=%d' % dof)
print("Note : dof = (rows-1)*(cols-1) = (4-1)*(2-1)")
print(expected)
# interpret test-statistic
null = 'outcome of the previous marketting campaign is not related to the client subscribing a term deposit'
alternate = 'outcome of the previous marketting campaign is related to the client subscribing a term deposit'
prob = 0.95
# FIX: chi2 is also a scipy.stats name, not a bare global
critical = stats.chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))
if abs(stat) >= critical:
    print('Dependent (reject H0)')
    print(alternate)
else:
    print('Independent (fail to reject H0)')
    print(null)
# interpret p-value
alpha = 1.0 - prob
print('significance=%.3f, p=%.3f' % (alpha, p))
if p <= alpha:
    print('Dependent (reject H0)')
    print(alternate)
else:
    print('Independent (fail to reject H0)')
    print(null)
This is the column which we need to predict.
The column records whether the customer has subscribed to a term deposit or not.
# Summary of the Target label (the column to predict).
print("The 5 point summary of Target column would give the below results: ")
data['Target'].describe()
# typo fix in the message: "cont" -> "count"
print("The count of each:")
print(data['Target'].value_counts())
Observation:
# Class balance of the label, then age distributions against several
# categorical columns, each split by Target.
sns.countplot(data['Target'])
# (column, title fragment, figure size) for each boxplot
for col, label, size in [
        ('loan', 'loan', (12, 8)),
        ('housing', 'housing loan', (12, 8)),
        ('default', 'default', (12, 8)),
        ('poutcome', 'poutcome', (20, 12)),
        ('contact', 'contact', (20, 12))]:
    plt.figure(figsize=size)
    plt.title('Age and ' + label + ' Boxplot')
    sns.boxplot(x=col, y='age', data=data, hue='Target')
Observation:
# duration vs poutcome boxplot, split by Target
# (the original comment wrongly said "age and housing loan Boxplot")
plt.figure(figsize=(12,8))
plt.title('duration and poutcome Boxplot')
sns.boxplot(x="poutcome",y="duration",hue="Target",data=data, palette="coolwarm")
Observation:
# Label-encode every categorical column to integer codes so that
# correlations can be computed, then draw the correlation heatmap.
cat = ['job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'poutcome', 'Target']
# Import label encoder
from sklearn import preprocessing
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
for col in cat:
    # Encode labels for categorical variables.
    data[col] = label_encoder.fit_transform(data[col])
data.corr()
from matplotlib import pyplot as plt
plt.figure(figsize=(16, 16))
ax = sns.heatmap(data.corr(), vmax=.7, square=False, fmt='.2f', annot=True,
                 linecolor='green', linewidths=0.4)
plt.title('Correlation heatmap')
plt.show()
Observation:
# Min-max scale the continuous columns to [0, 1], then split the frame
# into predictors X and label Y.
cols_to_norm = ['age', 'balance', 'duration']
block = data[cols_to_norm]
# column-wise (x - min) / (max - min), same as applying per column
data[cols_to_norm] = (block - block.min()) / (block.max() - block.min())
X = data.drop(['Target'], axis=1)  # Predictor feature columns
Y = data['Target']                 # Predicted class
There is an imbalance in the dataset: the count of clients who subscribed to the deposit is much smaller than the count of those who did not.
data['Target'].value_counts()
# 0 - who did not subscribe to term deposit
# 1 - who did subscribe to term deposit
data.info()
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
# 42 is just an arbitrary random seed (the original comment wrongly said 1)
x_train.head()
# the below function is used so as to print heat map of confusion matrix and roc curve
def print_confusion_matrix(y_true, y_pred):
    """Print/plot the confusion matrix for (y_true, y_pred) and the ROC curve.

    FIXES vs the original:
    - uses the y_true parameter instead of silently closing over the
      global y_test (calling it with any other labels was wrong before);
    - sklearn's confusion_matrix has rows = actual, columns = predicted,
      so cm[0][1] is a FALSE POSITIVE and cm[1][0] a FALSE NEGATIVE —
      the original printed those two labels swapped.
    """
    from sklearn.metrics import roc_curve, auc
    print("Confusion Matrix")
    cm = metrics.confusion_matrix(y_true, y_pred)
    print(cm)
    df_cm = pd.DataFrame(cm)
    plt.figure(figsize=(6, 6))
    sns.heatmap(df_cm, annot=True, fmt='g')
    plt.show()
    print('True negative = ', cm[0][0])
    print('False positive = ', cm[0][1])
    print('False negative = ', cm[1][0])
    print('True positive = ', cm[1][1])
    print(" " * 100)
    print(" " * 100)
    # ROC from the predicted labels (not probabilities, so the curve has a
    # single operating point)
    fpr, tpr, thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)
    # Plot ROC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC = %0.3f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.0])
    plt.ylim([-0.1, 1.01])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    print(" " * 100)
    print("auc:", roc_auc)
# KNN: grid-search leaf_size, n_neighbors and the Minkowski power p with
# 3-fold CV, then refit the winning combination and evaluate it.
leaf_size = [30, 40]
n_neighbors = [1, 5, 21, 51, 101]
# Power parameter for the Minkowski metric ----p
p = [1, 2]
# Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
# Create new KNN object
knn = KNeighborsClassifier()
# Use GridSearch
clf = GridSearchCV(knn, hyperparameters, cv=3)
# Fit the model
model = clf.fit(x_train, y_train)
print('Best leaf_size:', model.best_estimator_.get_params()['leaf_size'])
print('Best p:', model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', model.best_estimator_.get_params()['n_neighbors'])
from sklearn.metrics import roc_curve, auc
k = model.best_estimator_.get_params()['n_neighbors']
l = model.best_estimator_.get_params()['leaf_size']
P = model.best_estimator_.get_params()['p']
# BUG FIX: the refit previously passed n_neighbors=P (the Minkowski power,
# 1 or 2) instead of the tuned neighbour count k, discarding the search.
knn = KNeighborsClassifier(n_neighbors=k, leaf_size=l, p=P)
knn.fit(x_train, y_train)
predicted_labels = knn.predict(x_test)
knn.score(x_test, y_test)
print("Accuracy: ", knn.score(x_test, y_test))
print(" " * 100)
print_confusion_matrix(y_test, predicted_labels)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
# Logistic regression: tune C and penalty with a 3-fold grid search, then
# refit the best combination and evaluate on the test split.
C = [0.01, 0.1, 1, 5, 10, 20, 30]
penalty = ['l2', 'l1']
hyperparameters = dict(C=C, penalty=penalty)
lr = LogisticRegression()
clf = GridSearchCV(lr, hyperparameters, cv=3, verbose=0)
best_model = clf.fit(x_train, y_train)
# View best hyperparameters
best_params = best_model.best_estimator_.get_params()
print('Best Penalty:', best_params['penalty'])
print('Best C:', best_params['C'])
best_penalty = best_params['penalty']
best_c = best_params['C']
lr = LogisticRegression(C=best_c, penalty=best_penalty)
lr.fit(x_train, y_train)
predicted_labels = lr.predict(x_test)
lr.score(x_test, y_test)
print("Accuracy: ", lr.score(x_test, y_test))
print(" " * 100)
print_confusion_matrix(y_test, predicted_labels)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
# Gaussian Naive Bayes baseline (no hyperparameters to tune).
gnb = GaussianNB()
gnb.fit(x_train, y_train)
predicted_labels = gnb.predict(x_test)
print("Accuracy: ", gnb.score(x_test, y_test))
print(" " * 100)
mislabeled = (y_test != predicted_labels).sum()
print("Number of mislabeled points out of a total %d points : %d"
      % (x_test.shape[0], mislabeled))
print(" " * 100)
print_confusion_matrix(y_test, predicted_labels)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
from sklearn.svm import SVC
# RBF-kernel SVM: grid-search C and gamma (3-fold CV), refit the best
# configuration and evaluate on the held-out test split.
# defining parameter range
param_grid = {'C': [0.01, 0.1, 1],
              'gamma': [10, 1, 0.1],
              'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, cv=3, verbose=3)
# fitting the model for grid search
best_model = grid.fit(x_train, y_train)
# View best hyperparameters
print('Best gamma:', best_model.best_estimator_.get_params()['gamma'])
print('Best C:', best_model.best_estimator_.get_params()['C'])
print('Best kernel:', best_model.best_estimator_.get_params()['kernel'])
best_gamma = best_model.best_estimator_.get_params()['gamma']
best_c = best_model.best_estimator_.get_params()['C']
best_kernel = best_model.best_estimator_.get_params()['kernel']
print("Best Score:", best_model.best_score_)
# FIX: named `svc` (not `svm`) so the `from sklearn import svm` module
# import at the top of the file is no longer shadowed.
svc = SVC(C=best_c, gamma=best_gamma, kernel=best_kernel)
svc.fit(x_train, y_train)
predicted_labels = svc.predict(x_test)
print("Accuracy: ", svc.score(x_test, y_test))
print(" " * 100)
from sklearn.metrics import roc_curve, auc
print_confusion_matrix(y_test, predicted_labels)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
# Balance the classes by upsampling the minority class, then rescale and
# re-split the data.
# Separate majority and minority classes
data_majority = data[data['Target'] == 0]
data_minority = data[data['Target'] == 1]
# Upsample minority class. GENERALIZED: match the majority-class size
# dynamically instead of hard-coding 39922, so this keeps working if the
# dataset (or the preceding filtering) changes.
data_minority_upsampled = resample(data_minority,
                                   replace=True,                  # sample with replacement
                                   n_samples=len(data_majority),  # to match majority class
                                   random_state=123)              # reproducible results
# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])
# Display new class counts
data_upsampled['Target'].value_counts()
# Normalizing the data (min-max scaling of the continuous columns)
cols_to_norm = ['age', 'balance', 'duration']
data_upsampled[cols_to_norm] = data_upsampled[cols_to_norm].apply(
    lambda x: (x - x.min()) / (x.max() - x.min()))
X = data_upsampled.drop(['Target'], axis=1)  # Predictor feature columns
Y = data_upsampled['Target']                 # Predicted class
# splitting the data (70/30); 1 is just an arbitrary random seed
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
x_train.head()
# KNN on the upsampled data: same grid search as before, then refit the
# winning combination and evaluate it.
leaf_size = [30, 40]
n_neighbors = [1, 5, 21, 51, 101]
# Power parameter for the Minkowski metric ----p
p = [1, 2]
# Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
# Create new KNN object
knn = KNeighborsClassifier()
# Use GridSearch
clf = GridSearchCV(knn, hyperparameters, cv=3)
# Fit the model
model = clf.fit(x_train, y_train)
print('Best leaf_size:', model.best_estimator_.get_params()['leaf_size'])
print('Best p:', model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', model.best_estimator_.get_params()['n_neighbors'])
k = model.best_estimator_.get_params()['n_neighbors']
l = model.best_estimator_.get_params()['leaf_size']
P = model.best_estimator_.get_params()['p']
# BUG FIX: the refit previously passed n_neighbors=P (the Minkowski power,
# 1 or 2) instead of the tuned neighbour count k, discarding the search.
knn = KNeighborsClassifier(n_neighbors=k, leaf_size=l, p=P)
knn.fit(x_train, y_train)
predicted_labels = knn.predict(x_test)
knn.score(x_test, y_test)
print("Accuracy: ", knn.score(x_test, y_test))
print(" " * 100)
print_confusion_matrix(y_test, predicted_labels)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
# Logistic regression on the upsampled data: tune C and penalty with a
# 3-fold grid search, refit the best combination, evaluate.
C = [0.01, 0.1, 1, 5, 10, 20, 30]
penalty = ['l2', 'l1']
hyperparameters = dict(C=C, penalty=penalty)
lr = LogisticRegression()
clf = GridSearchCV(lr, hyperparameters, cv=3, verbose=0)
best_model = clf.fit(x_train, y_train)
# View best hyperparameters
best_params = best_model.best_estimator_.get_params()
print('Best Penalty:', best_params['penalty'])
print('Best C:', best_params['C'])
best_penalty = best_params['penalty']
best_c = best_params['C']
lr = LogisticRegression(C=best_c, penalty=best_penalty)
lr.fit(x_train, y_train)
predicted_labels = lr.predict(x_test)
lr.score(x_test, y_test)
print("Accuracy: ", lr.score(x_test, y_test))
print(" " * 100)
print_confusion_matrix(y_test, predicted_labels)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
# Gaussian Naive Bayes on the upsampled data.
gnb = GaussianNB()
gnb.fit(x_train, y_train)
predicted_labels = gnb.predict(x_test)
print("Accuracy: ", gnb.score(x_test, y_test))
print(" " * 100)
mislabeled = (y_test != predicted_labels).sum()
print("Number of mislabeled points out of a total %d points : %d"
      % (x_test.shape[0], mislabeled))
print(" " * 100)
print_confusion_matrix(y_test, predicted_labels)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
from sklearn.svm import SVC
# RBF-kernel SVM on the upsampled data: grid-search C and gamma, refit the
# best configuration and evaluate.
# defining parameter range
param_grid = {'C': [0.01, 0.1, 1],
              'gamma': [10, 1, 0.1],
              'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, cv=3, verbose=3)
# fitting the model for grid search
best_model = grid.fit(x_train, y_train)
# View best hyperparameters
print('Best gamma:', best_model.best_estimator_.get_params()['gamma'])
print('Best C:', best_model.best_estimator_.get_params()['C'])
print('Best kernel:', best_model.best_estimator_.get_params()['kernel'])
best_gamma = best_model.best_estimator_.get_params()['gamma']
best_c = best_model.best_estimator_.get_params()['C']
best_kernel = best_model.best_estimator_.get_params()['kernel']
print("Best Score:", best_model.best_score_)
# FIX: named `svc` (not `svm`) so the `from sklearn import svm` module
# import at the top of the file is no longer shadowed.
svc = SVC(C=best_c, gamma=best_gamma, kernel=best_kernel)
svc.fit(x_train, y_train)
predicted_labels = svc.predict(x_test)
print("Accuracy: ", svc.score(x_test, y_test))
print(" " * 100)
from sklearn.metrics import roc_curve, auc
print_confusion_matrix(y_test, predicted_labels)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, predicted_labels, target_names=target_names))
# Reload the raw csv, re-encode the categoricals, rescale the continuous
# columns, re-split, and fit a deep (max_depth=12) decision tree.
# reading data
data = pd.read_csv('/content/bank-full.csv')
cat = ['job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'poutcome', 'Target']
# Import label encoder
from sklearn import preprocessing
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
for col in cat:
    # Encode labels for categorical variables.
    data[col] = label_encoder.fit_transform(data[col])
# Normalizing the data (min-max scale the continuous columns)
cols_to_norm = ['age', 'balance', 'duration']
block = data[cols_to_norm]
data[cols_to_norm] = (block - block.min()) / (block.max() - block.min())
X = data.drop(['Target'], axis=1)  # Predictor feature columns
Y = data['Target']                 # Predicted class
from sklearn.model_selection import train_test_split
# 70/30 split; 42 is just an arbitrary random seed
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
from sklearn.tree import DecisionTreeClassifier
dTree = DecisionTreeClassifier(criterion='gini', max_depth=12, random_state=1)
dTree.fit(x_train, y_train)
print("train score", dTree.score(x_train, y_train))
print("Test Score", dTree.score(x_test, y_test))
Visualizing the tree
# Render the fitted deep tree with graphviz and record its accuracy in the
# comparison dataframe.
train_char_label = ['No', 'Yes']
dot_data = StringIO()
export_graphviz(dTree, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=list(x_train),
                class_names=train_char_label)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('tree.png')
Image(graph.create_png())
from sklearn.metrics import accuracy_score
preds_pruned = dTree.predict(x_test)
preds_pruned_train = dTree.predict(x_train)
print("Accuracy test Score", accuracy_score(y_test, preds_pruned))
print("Accuracy train Score", accuracy_score(y_train, preds_pruned_train))
acc_DT = accuracy_score(y_test, preds_pruned)
# Store the accuracy results for each model in a dataframe for final comparison
resultsDf = pd.DataFrame({'Method': ['Decision Tree'], 'accuracy': acc_DT})
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
# Regularised tree (max_depth=3) to curb the deep tree's overfitting,
# plus its graphviz rendering and feature importances.
dTreeR = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTreeR.fit(x_train, y_train)
print("Train Score", dTreeR.score(x_train, y_train))
print("Test Score", dTreeR.score(x_test, y_test))
train_char_label = ['No', 'Yes']
dot_dat = StringIO()
export_graphviz(dTreeR, out_file=dot_dat, filled=True, rounded=True,
                special_characters=True, feature_names=list(x_train),
                class_names=train_char_label)
graph = pydotplus.graph_from_dot_data(dot_dat.getvalue())
graph.write_png('treeR.png')
Image(graph.create_png())
# Feature importance = normalized total reduction of the split criterion
# contributed by each feature (Gini importance).
print(pd.DataFrame(dTreeR.feature_importances_, columns=["Imp"], index=x_train.columns))
print(dTreeR.score(x_test, y_test))
y_predict = dTreeR.predict(x_test)
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
from sklearn.ensemble import BaggingClassifier
# Bagging over the deep decision tree: 50 bootstrap replicas.
bgcl = BaggingClassifier(base_estimator=dTree, n_estimators=50, random_state=1)
#bgcl = BaggingClassifier(n_estimators=50,random_state=1)
bgcl.fit(x_train, y_train)
y_predict = bgcl.predict(x_test)
acc_BG = accuracy_score(y_test, y_predict)
print("Test Score: ", bgcl.score(x_test, y_test))
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
# append this model's accuracy to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method': ['Bagging'], 'accuracy': [acc_BG]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
resultsDf
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 10 weak learners (a 50-estimator variant left commented out).
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
#abcl = AdaBoostClassifier( n_estimators=50,random_state=1)
abcl.fit(x_train, y_train)
y_predict = abcl.predict(x_test)
acc_AB = accuracy_score(y_test, y_predict)
print("Test Score: ", abcl.score(x_test, y_test))
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
# append this model's accuracy to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method': ['Adaboost'], 'accuracy': [acc_AB]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
resultsDf
from sklearn.ensemble import RandomForestClassifier
# Random forest of 50 trees.
# NOTE(review): no random_state is set, so results vary between runs.
rfcl = RandomForestClassifier(n_estimators=50)
rfcl.fit(x_train, y_train)
y_predict = rfcl.predict(x_test)
acc_RF = accuracy_score(y_test, y_predict)
print(rfcl.score(x_test, y_test))
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
# append this model's accuracy to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method': ['Random Forest'], 'accuracy': [acc_RF]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
resultsDf
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting: 50 estimators, learning rate 0.1.
gbcl = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=22)
gbcl = gbcl.fit(x_train, y_train)
# BUG FIX: predictions and score previously came from the random forest
# (rfcl), so the reported "Gradient Boost" accuracy was really the RF's.
y_predict = gbcl.predict(x_test)
acc_GB = accuracy_score(y_test, y_predict)
print(gbcl.score(x_test, y_test))
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
tempResultsDf = pd.DataFrame({'Method': ['Gradient Boost'], 'accuracy': [acc_GB]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'accuracy']]
resultsDf
resultsDf
# Balance the classes again (for the ensemble models) by upsampling the
# minority class, then rescale and re-split.
# Separate majority and minority classes
data_majority = data[data['Target'] == 0]
data_minority = data[data['Target'] == 1]
# Upsample minority class. GENERALIZED: match the majority-class size
# dynamically instead of hard-coding 39922, so this keeps working if the
# dataset changes.
data_minority_upsampled = resample(data_minority,
                                   replace=True,                  # sample with replacement
                                   n_samples=len(data_majority),  # to match majority class
                                   random_state=123)              # reproducible results
# Combine majority class with upsampled minority class
data_upsampled = pd.concat([data_majority, data_minority_upsampled])
# Display new class counts
data_upsampled['Target'].value_counts()
# Normalizing the data (min-max scaling of the continuous columns)
cols_to_norm = ['age', 'balance', 'duration']
data_upsampled[cols_to_norm] = data_upsampled[cols_to_norm].apply(
    lambda x: (x - x.min()) / (x.max() - x.min()))
X = data_upsampled.drop(['Target'], axis=1)  # Predictor feature columns
Y = data_upsampled['Target']                 # Predicted class
# splitting the data (70/30); 1 is just an arbitrary random seed
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
x_train.head()
from sklearn.tree import DecisionTreeClassifier
# Deep tree (max_depth=12) refit on the upsampled training split.
dTree = DecisionTreeClassifier(criterion='gini', max_depth=12, random_state=1)
dTree.fit(x_train, y_train)
print("train score", dTree.score(x_train, y_train))
print("Test Score", dTree.score(x_test, y_test))
Visualizing the tree
# Render the upsampled-data tree with graphviz and record its accuracy in
# a fresh comparison dataframe (resultsD).
train_char_label = ['No', 'Yes']
dot_data = StringIO()
export_graphviz(dTree, out_file=dot_data, filled=True, rounded=True,
                special_characters=True, feature_names=list(x_train),
                class_names=train_char_label)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('tree.png')
Image(graph.create_png())
from sklearn.metrics import accuracy_score
preds_pruned = dTree.predict(x_test)
preds_pruned_train = dTree.predict(x_train)
print("Accuracy test Score", accuracy_score(y_test, preds_pruned))
print("Accuracy train Score", accuracy_score(y_train, preds_pruned_train))
acc_DT = accuracy_score(y_test, preds_pruned)
# Store the accuracy results for each model in a dataframe for final comparison
resultsD = pd.DataFrame({'Method': ['Decision Tree'], 'accuracy': acc_DT})
resultsD = resultsD[['Method', 'accuracy']]
resultsD
# Regularised tree (max_depth=3) on the upsampled data, with graphviz
# rendering and Gini feature importances.
dTreeR = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTreeR.fit(x_train, y_train)
print("Train Score", dTreeR.score(x_train, y_train))
print("Test Score", dTreeR.score(x_test, y_test))
train_char_label = ['No', 'Yes']
dot_dat = StringIO()
export_graphviz(dTreeR, out_file=dot_dat, filled=True, rounded=True,
                special_characters=True, feature_names=list(x_train),
                class_names=train_char_label)
graph = pydotplus.graph_from_dot_data(dot_dat.getvalue())
graph.write_png('treeR.png')
Image(graph.create_png())
# Feature importance = normalized total reduction of the split criterion
# contributed by each feature (Gini importance).
print(pd.DataFrame(dTreeR.feature_importances_, columns=["Imp"], index=x_train.columns))
print(dTreeR.score(x_test, y_test))
y_predict = dTreeR.predict(x_test)
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
from sklearn.ensemble import BaggingClassifier
# Bagging over the deep tree (upsampled data): 50 bootstrap replicas.
bgcl = BaggingClassifier(base_estimator=dTree, n_estimators=50, random_state=1)
#bgcl = BaggingClassifier(n_estimators=50,random_state=1)
bgcl.fit(x_train, y_train)
y_predict = bgcl.predict(x_test)
acc_BG = accuracy_score(y_test, y_predict)
print("Test Score: ", bgcl.score(x_test, y_test))
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
# append this model's accuracy to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method': ['Bagging'], 'accuracy': [acc_BG]})
resultsD = pd.concat([resultsD, tempResultsDf])
resultsD = resultsD[['Method', 'accuracy']]
resultsD
resultsD
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with 10 weak learners on the upsampled data.
abcl = AdaBoostClassifier(n_estimators=10, random_state=1)
#abcl = AdaBoostClassifier( n_estimators=50,random_state=1)
abcl.fit(x_train, y_train)
y_predict = abcl.predict(x_test)
acc_AB = accuracy_score(y_test, y_predict)
print("Test Score: ", abcl.score(x_test, y_test))
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
# append this model's accuracy to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method': ['Adaboost'], 'accuracy': [acc_AB]})
resultsD = pd.concat([resultsD, tempResultsDf])
resultsD = resultsD[['Method', 'accuracy']]
resultsD
resultsD
from sklearn.ensemble import RandomForestClassifier
# Random forest of 50 trees on the upsampled data.
# NOTE(review): no random_state is set, so results vary between runs.
rfcl = RandomForestClassifier(n_estimators=50)
rfcl.fit(x_train, y_train)
y_predict = rfcl.predict(x_test)
acc_RF = accuracy_score(y_test, y_predict)
print(rfcl.score(x_test, y_test))
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
# append this model's accuracy to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method': ['Random Forest'], 'accuracy': [acc_RF]})
resultsD = pd.concat([resultsD, tempResultsDf])
resultsD = resultsD[['Method', 'accuracy']]
resultsD
resultsD
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting on the upsampled data: 50 estimators, learning rate 0.1.
gbcl = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, random_state=22)
gbcl = gbcl.fit(x_train, y_train)
# BUG FIX: predictions and score previously came from the random forest
# (rfcl), so the reported "Gradient Boost" accuracy was really the RF's.
y_predict = gbcl.predict(x_test)
acc_GB = accuracy_score(y_test, y_predict)
print(gbcl.score(x_test, y_test))
print_confusion_matrix(y_test, y_predict)
target_names = ['class 0', 'class 1']
print(" " * 100)
print("classification report for test data:")
print(classification_report(y_test, y_predict, target_names=target_names))
tempResultsDf = pd.DataFrame({'Method': ['Gradient Boost'], 'accuracy': [acc_GB]})
resultsD = pd.concat([resultsD, tempResultsDf])
resultsD = resultsD[['Method', 'accuracy']]
resultsD
resultsD
from prettytable import PrettyTable
# Final summary tables: test accuracy / AUC of every model variant.
x = PrettyTable()
print("Label Encoding")
print("Before Upsampling")
x.field_names = ["Model", "Test Accuracy", "Auc"]
x.add_row(["KNN", 0.86, 0.622])
x.add_row(["Logistic REgression", 0.88, 0.501])
x.add_row(["Naive Bayes", 0.827, 0.674])
x.add_row(["SVM", 0.88, 0.537])
print(x)
x = PrettyTable()
print("After Upsampling")
x.field_names = ["Model", "Test Accuracy", "Auc"]
x.add_row(["KNN", 0.95, 0.956])
x.add_row(["Logistic REgression", 0.67, 0.679])
x.add_row(["Naive Bayes", 0.74, 0.749])
x.add_row(["SVM", 0.97, 0.973])
print(x)
# BUG FIX: a fresh table is required here — the original kept appending the
# ensemble rows to the "After Upsampling" table and re-printed that table
# with the ensemble rows mixed in.
x = PrettyTable()
print("Ensemble Models - Before Upsampling")
x.field_names = ["Model", "Test Accuracy", "Auc"]
x.add_row(["Decesion Tree", 0.89, 0.592])
x.add_row(["Bagging", 0.90, 0.703])
x.add_row(["AdaBossting", 0.88, 0.634])
x.add_row(["RandomForest", 0.90, 0.692])
x.add_row(["Gradient Bossting Classifier", 0.90, 0.69])
print(x)
x = PrettyTable()
print("Ensemble Models - After Upsampling")
x.field_names = ["Model", "Test Accuracy", "Auc"]
x.add_row(["Decesion Tree", 0.88, 0.788])
x.add_row(["Bagging", 0.908, 0.909])
x.add_row(["AdaBossting", 0.78, 0.788])
x.add_row(["RandomForest", 0.96, 0.968])
x.add_row(["Gradient Bossting Classifier", 0.96, 0.968])
print(x)
Observation - Label Encoding